First, let’s take a look at the data that is available to us.
How do you read a .csv into R?
# read in biographical data table
bio <- read_csv("https://raw.githubusercontent.com/majerus/apra_data_science_courses/master/bio_data_table.csv")
# read in giving data table
giving <- read_csv("https://raw.githubusercontent.com/majerus/apra_data_science_courses/master/giving_data_table.csv")
You can read multiple data files into the same R session. Each of these files contain fictional data created by the generate_data.R script.
Have you ever connected R to a database?
The following is an example of how to create a sample database in R and to load information from that database. More information on using databases from R can be found here. If you use the tidyverse, you can use the same workflow with information from databases and .csv or Excel files.
# create database connection
con <- DBI::dbConnect(RSQLite::SQLite(), path = ":dbname:")
# put some data in our new database
copy_to(dest = con,
df = bio,
name = "bio_table",
temporary = FALSE)
copy_to(dest = con,
df = giving ,
name = "giving_table",
temporary = FALSE)
# print out our table names
db_list_tables(con)
## [1] "bio_table" "giving_table" "sqlite_stat1" "sqlite_stat4"
# let's take a look at the bio table
tbl(con, "bio_table")
## # Source: table<bio_table> [?? x 14]
## # Database: sqlite 3.30.1 []
## id name household_id country city birthday deceased zip state lat
## <dbl> <chr> <dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 5.96e6 Smit… 1000259 United… Lumb… -16640 Y 28358 NC 34.6
## 2 5.44e6 el-N… 1000279 United… West… -16613 Y 19382 PA 40.0
## 3 3.95e6 al-D… 1000279 United… Ronk… NA Y 11779 NY 40.8
## 4 4.13e6 Blac… 1000308 United… Colu… NA Y 43207 OH 40.0
## 5 7.81e6 Aten… 1000308 United… De p… -18532 Y 54115 WI 44.4
## 6 3.36e6 Ahma… 1000570 United… Gree… -16585 Y 54302 WI 44.5
## 7 1.98e6 Han,… 1000758 United… Phoe… -17676 <NA> 85035 AZ 33.5
## 8 7.67e6 Saxe… 1000758 United… Lake… -17442 <NA> 55041 MN 44.4
## 9 7.87e6 Pedr… 1000913 United… Harp… -16340 Y 48225 MI 42.4
## 10 2.03e6 Raib… 1000913 United… Manc… -18381 Y 03103 NH 43.0
## # … with more rows, and 4 more variables: lon <dbl>, capacity <chr>,
## # capacity_source <chr>, race <chr>
# we can use dplyr syntax to query a database
# dplyr automatically converts our r code to sql
# alternatively you can write sql code directly in rmarkdown as well
tbl(con, "bio_table") %>%
filter(state == "NC") %>%
select(name, city, capacity)
## # Source: lazy query [?? x 3]
## # Database: sqlite 3.30.1 []
## name city capacity
## <chr> <chr> <chr>
## 1 Smith, Katalina Lumberton $50k - $75K
## 2 Silvis, Eric Greenville $5k - $10k
## 3 Pinto, Anposahiyela Roxboro $10k - $25k
## 4 Sullivan, Janetta Troutman $2.5k - $5k
## 5 el-Mina, Husaam Huntersville $5k - $10k
## 6 Littlejohn, Colton Kinston $10k - $25k
## 7 Marrs, Demetrius Gastonia <NA>
## 8 Lovato, Lucero Rubi Statesville $2.5k - $5k
## 9 Black, Kenneth Salisbury $50k - $75K
## 10 Dinakar, Hyeon Harmony $10k - $25k
## # … with more rows
How do you take a look at your data in R?
bio %>%
glimpse()
## Rows: 100,000
## Columns: 14
## $ id <dbl> 5961718, 5443595, 3946937, 4129813, 7813954, 3362879,…
## $ name <chr> "Smith, Katalina", "el-Niazi, Khaleel", "al-Dib, Mutl…
## $ household_id <dbl> 1000259, 1000279, 1000279, 1000308, 1000308, 1000570,…
## $ country <chr> "United States", "United States", "United States", "U…
## $ city <chr> "Lumberton", "West chester", "Ronkonkoma", "Columbus"…
## $ birthday <date> 1924-06-11, 1924-07-08, NA, NA, 1919-04-07, 1924-08-…
## $ deceased <chr> "Y", "Y", "Y", "Y", "Y", "Y", NA, NA, "Y", "Y", "Y", …
## $ zip <chr> "28358", "19382", "11779", "43207", "54115", "54302",…
## $ state <chr> "NC", "PA", "NY", "OH", "WI", "WI", "AZ", "MN", "MI",…
## $ lat <dbl> 34.63, 39.95, 40.80, 39.98, 44.43, 44.52, 33.47, 44.4…
## $ lon <dbl> -79.01, -75.60, -73.12, -82.98, -88.07, -87.98, -112.…
## $ capacity <chr> "$50k - $75K", "$2.5k - $5k", "$2.5k - $5k", "$250k -…
## $ capacity_source <chr> "screening", "screening", "institutional", "instituti…
## $ race <chr> "Black or African American", "Non-Hispanic white", "T…
giving %>%
glimpse()
## Rows: 378,001
## Columns: 6
## $ `household ID` <dbl> 4986394, 8553925, 8553925, 9623708, 9623708, 3255528, …
## $ ID <dbl> 5539534, 6099545, 2847749, 9705490, 1758068, 5518600, …
## $ `gift id` <dbl> 2921084, 2921123, 2921123, 2921222, 2921222, 2921225, …
## $ `credit Type` <chr> "Soft-Credit", "Hard-Credit", "Soft-Credit", "Hard-Cre…
## $ `gift amt` <dbl> 476, 20086, 1258, 30377, 956, 1905, 1441, 1115, 87446,…
## $ `gift date` <date> 2019-08-24, 2016-12-20, 2019-08-13, 2017-04-08, 2018-…
Does anything look off about the giving data?
clean_names(giving)
## # A tibble: 378,001 x 6
## household_id id gift_id credit_type gift_amt gift_date
## <dbl> <dbl> <dbl> <chr> <dbl> <date>
## 1 4986394 5539534 2921084 Soft-Credit 476 2019-08-24
## 2 8553925 6099545 2921123 Hard-Credit 20086 2016-12-20
## 3 8553925 2847749 2921123 Soft-Credit 1258 2019-08-13
## 4 9623708 9705490 2921222 Hard-Credit 30377 2017-04-08
## 5 9623708 1758068 2921222 Soft-Credit 956 2018-04-26
## 6 3255528 5518600 2921225 Hard-Credit 1905 2019-04-11
## 7 3255528 5073590 2921225 Soft-Credit 1441 2017-06-02
## 8 1774374 3755877 2921299 Hard-Credit 1115 2016-03-02
## 9 1774374 5633291 2921299 Soft-Credit 87446 2017-01-24
## 10 3784534 7489954 2921305 Hard-Credit 1792 2018-02-23
## # … with 377,991 more rows
giving <- clean_names(giving)
R uses the NA code for missing values. You can test if a value is missing using the is.na() function.
How many missing values are there in the deceased variable?
is.na(bio$deceased)[1:100]
## [1] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] TRUE FALSE FALSE TRUE
sum(is.na(bio$deceased))
## [1] 10000
bio %>%
summarise(deceased_na = sum(is.na(deceased)))
## # A tibble: 1 x 1
## deceased_na
## <int>
## 1 10000
bio %>%
summarise_all(funs(sum(is.na(.))))
## # A tibble: 1 x 14
## id name household_id country city birthday deceased zip state lat
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 10000 10000 10000 10000 10000
## # … with 4 more variables: lon <int>, capacity <int>, capacity_source <int>,
## # race <int>
giving %>%
summarise_all(funs(sum(is.na(.))))
## # A tibble: 1 x 6
## household_id id gift_id credit_type gift_amt gift_date
## <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0
Which records are missing zip, state, lat, and lon?
bio %>%
filter(is.na(zip)) %>%
glimpse()
## Rows: 10,000
## Columns: 14
## $ id <dbl> 2307732, 5595799, 8993651, 4469674, 7406292, 6959349,…
## $ name <chr> "al-Hassan, Rihaab", "Pavisook, Bethany", "Hands, Isa…
## $ household_id <dbl> 1001089, 1003034, 1003287, 1003320, 1004096, 1004098,…
## $ country <chr> "Mexico", "China", "India", "Pakistan", "Pakistan", "…
## $ city <chr> "Mexico City", "Beijing", "Kolkata", "Faisalabad", "L…
## $ birthday <date> 1923-12-24, 1920-09-20, 1921-10-26, NA, 1919-09-09, …
## $ deceased <chr> "Y", "N", "Y", "Y", "Y", "Y", "N", "Y", "Y", "Y", "Y"…
## $ zip <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ state <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ lat <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ lon <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ capacity <chr> "$2.5k - $5k", "$500k - $750k", "$500k - $750k", "$5k…
## $ capacity_source <chr> "institutional", "screening", "screening", "screening…
## $ race <chr> "Non-Hispanic white", "Non-Hispanic white", "Hispanic…
The zipcode package can be used to get lat/lon coordinates for each zipcode’s centroid in the US. This data is also available here.
You can treat character, numeric, and factor variables seperately using variations of the select function.
bio %>%
select_if(is.numeric)
## # A tibble: 100,000 x 4
## id household_id lat lon
## <dbl> <dbl> <dbl> <dbl>
## 1 5961718 1000259 34.6 -79.0
## 2 5443595 1000279 40.0 -75.6
## 3 3946937 1000279 40.8 -73.1
## 4 4129813 1000308 40.0 -83.0
## 5 7813954 1000308 44.4 -88.1
## 6 3362879 1000570 44.5 -88.0
## 7 1980892 1000758 33.5 -112.
## 8 7674576 1000758 44.4 -92.3
## 9 7870237 1000913 42.4 -82.9
## 10 2026476 1000913 43.0 -71.4
## # … with 99,990 more rows
bio %>%
select_if(is.character)
## # A tibble: 100,000 x 9
## name country city deceased zip state capacity capacity_source race
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Smith,… United … Lumbe… Y 28358 NC $50k - … screening Black …
## 2 el-Nia… United … West … Y 19382 PA $2.5k -… screening Non-Hi…
## 3 al-Dib… United … Ronko… Y 11779 NY $2.5k -… institutional Two or…
## 4 Black,… United … Colum… Y 43207 OH $250k -… institutional Non-Hi…
## 5 Atenci… United … De pe… Y 54115 WI $75k - … screening Non-Hi…
## 6 Ahmad,… United … Green… Y 54302 WI $75k - … screening Non-Hi…
## 7 Han, L… United … Phoen… <NA> 85035 AZ $2.5k -… institutional Non-Hi…
## 8 Saxer,… United … Lake … <NA> 55041 MN $25k - … screening Non-Hi…
## 9 Pedraz… United … Harpe… Y 48225 MI $25k - … screening Hispan…
## 10 Raibur… United … Manch… Y 03103 NH $25k - … institutional Non-Hi…
## # … with 99,990 more rows
Which variable is but should not be a character?
bio %>%
select_if(is.character)
## # A tibble: 100,000 x 9
## name country city deceased zip state capacity capacity_source race
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Smith,… United … Lumbe… Y 28358 NC $50k - … screening Black …
## 2 el-Nia… United … West … Y 19382 PA $2.5k -… screening Non-Hi…
## 3 al-Dib… United … Ronko… Y 11779 NY $2.5k -… institutional Two or…
## 4 Black,… United … Colum… Y 43207 OH $250k -… institutional Non-Hi…
## 5 Atenci… United … De pe… Y 54115 WI $75k - … screening Non-Hi…
## 6 Ahmad,… United … Green… Y 54302 WI $75k - … screening Non-Hi…
## 7 Han, L… United … Phoen… <NA> 85035 AZ $2.5k -… institutional Non-Hi…
## 8 Saxer,… United … Lake … <NA> 55041 MN $25k - … screening Non-Hi…
## 9 Pedraz… United … Harpe… Y 48225 MI $25k - … screening Hispan…
## 10 Raibur… United … Manch… Y 03103 NH $25k - … institutional Non-Hi…
## # … with 99,990 more rows
bio <-
bio %>%
mutate(zip = as.numeric(zip))
How might we recode the missing values for the deceased variable?
bio <-
bio %>%
mutate(deceased_missing = ifelse(is.na(deceased), "Y", "N"),
deceased = ifelse(is.na(deceased), "N", deceased))
Are certain capacity sources missing capacity information?
# capacity source
bio %>%
count(capacity, capacity_source)
## # A tibble: 49 x 3
## capacity capacity_source n
## <chr> <chr> <int>
## 1 >$1k institutional 1940
## 2 >$1k screening 2983
## 3 >$1k <NA> 520
## 4 $100k - $250k institutional 1645
## 5 $100k - $250k screening 2486
## 6 $100k - $250k <NA> 469
## 7 $10k - $25k institutional 4840
## 8 $10k - $25k screening 7416
## 9 $10k - $25k <NA> 1377
## 10 $10M - $25M institutional 1
## # … with 39 more rows
bio %>%
filter(is.na(capacity_source)) %>%
count(capacity, capacity_source)
## # A tibble: 15 x 3
## capacity capacity_source n
## <chr> <chr> <int>
## 1 >$1k <NA> 520
## 2 $100k - $250k <NA> 469
## 3 $10k - $25k <NA> 1377
## 4 $10M - $25M <NA> 2
## 5 $1k - $2.5k <NA> 525
## 6 $1M - $2.5M <NA> 7
## 7 $2.5k - $5k <NA> 938
## 8 $250k - $500k <NA> 492
## 9 $25k - $50k <NA> 1385
## 10 $500k - $750k <NA> 461
## 11 $50k - $75K <NA> 883
## 12 $5k - $10k <NA> 855
## 13 $750k - $1M <NA> 240
## 14 $75k - $100k <NA> 891
## 15 <NA> <NA> 955
Let’s take a closer look at the birthday variable. What do you notice when we sort all birthdays in order?
# birthdays - let's sort all the birthdays in order
bio %>%
select(birthday, deceased) %>%
arrange(birthday)
## # A tibble: 100,000 x 2
## birthday deceased
## <date> <chr>
## 1 1900-01-01 Y
## 2 1900-01-01 Y
## 3 1900-01-01 Y
## 4 1900-01-01 Y
## 5 1900-01-01 Y
## 6 1900-01-01 Y
## 7 1900-01-01 Y
## 8 1900-01-01 Y
## 9 1900-01-01 Y
## 10 1900-01-01 Y
## # … with 99,990 more rows
# let's take a look at the distribution of birthdays
bio %>%
select(birthday) %>%
ggplot(aes(x = birthday)) +
geom_histogram()
# let's clean up what appears to be a missing value indicator
bio <-
bio %>%
mutate(birthday = if_else(birthday == as.Date("1/1/1900", "%m/%d/%Y"),
as.Date(NA),
birthday))
# let's take another look
bio %>%
select(birthday) %>%
ggplot(aes(x = birthday)) +
geom_histogram()
# bio table - character variables bar plots
bio %>%
select_if(is.character) %>%
select(-name, -city) %>%
gather("variable", "value") %>%
ggplot(aes(x = value)) +
geom_bar() +
facet_wrap(~variable, scales = "free", nrow = 7) +
theme(axis.text.y = element_text(size = 6)) +
coord_flip()
What looks strange?
# clean capacity ratings
sort(unique(bio$capacity))
## [1] ">$1k" "$100k - $250k" "$10k - $25k" "$10M - $25M"
## [5] "$1k - $2.5k" "$1M - $2.5M" "$2.5k - $5k" "$250k - $500k"
## [9] "$25k - $50k" "$500k - $750k" "$50k - $75K" "$5k - $10k"
## [13] "$5M - $10M" "$750k - $1M" "$75k - $100k" "2.5M - $5M"
# demo multiple cursors
# [1] ">$1k"
# [2] "$100k - $250k"
# [3] "$10k - $25k"
# [4] "$10M - $25M"
# [5] "$1k - $2.5k"
# [6] "$1M - $2.5M"
# [7] "$2.5k - $5k"
# [8] "$250k - $500k"
# [9] "$25k - $50k"
# [10] "$25M - $50M"
# [11] "$500k - $750k"
# [12] "$50k - $75K"
# [13] "$5k - $10k"
# [14] "$5M - $10M"
# [15] "$750k - $1M"
# [16] "$75k - $100k"
bio <-
bio %>%
mutate(capacity = factor(capacity, levels = c(">$1k",
"$1k - $2.5k",
"$2.5k - $5k",
"$5k - $10k",
"$10k - $25k",
"$25k - $50k",
"$50k - $75K",
"$75k - $100k",
"$100k - $250k",
"$250k - $500k",
"$500k - $750k",
"$750k - $1M",
"$1M - $2.5M",
"2.5M - $5M",
"$5M - $10M",
"$10M - $25M",
"$25M - $50M")))
# let's take another look at those capacities
bio %>%
select(capacity) %>%
ggplot(aes(x = capacity)) +
geom_bar() +
coord_flip()
# state
bio %>%
filter(!is.na(state)) %>%
count(state) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ggplot(aes(x = reorder(state, n), y = n)) +
geom_bar(stat = "identity", fill = "#027854") +
coord_flip() +
ggthemes::theme_tufte() +
labs(y = "Number of Prospects",
x = "Primary Residence State",
title = "Prospects by State")
Is this right? Do we need to exclude some prospects?
# state
state_plot <-
bio %>%
filter(!is.na(state),
deceased == "N",
!duplicated(household_id)) %>%
count(state) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ggplot(aes(x = reorder(state, n), y = n)) +
geom_bar(stat = "identity", fill = "#027854") +
coord_flip() +
ggthemes::theme_tufte() +
labs(y = "Number of Prospects",
x = "Primary Residence State",
title = "Prospects by State")
ggplotly(state_plot)
How would we plot the distribution of gift dates (i.e., the number of gifts per day)?
# gifts per day
giving %>%
filter(credit_type == "Hard-Credit") %>%
ggplot(aes(x = gift_date)) +
geom_histogram()
How about the distribution of gift amounts?
giving %>%
filter(credit_type == "Hard-Credit") %>%
ggplot(aes(x = gift_amt)) +
geom_histogram()
giving %>%
filter(credit_type == "Hard-Credit",
gift_amt < 1000000) %>%
ggplot(aes(x = gift_amt)) +
geom_histogram()
giving %>%
filter(credit_type == "Hard-Credit",
gift_amt < 100000) %>%
ggplot(aes(x = gift_amt)) +
geom_histogram()
What’s our first step?
giving <-
giving %>%
mutate(fy = ifelse(month(gift_date) >= 7,
year(gift_date) +1,
year(gift_date)))
giving %>%
count(fy)
## # A tibble: 6 x 2
## fy n
## <dbl> <int>
## 1 2016 66498
## 2 2017 75654
## 3 2018 75187
## 4 2019 75578
## 5 2020 75984
## 6 2021 9100
giving %>%
filter(credit_type == "Hard-Credit") %>%
group_by(fy) %>%
summarise(total_giving = dollar(sum(gift_amt)))
## # A tibble: 6 x 2
## fy total_giving
## <dbl> <chr>
## 1 2016 $788,796,803
## 2 2017 $946,349,616
## 3 2018 $903,744,855
## 4 2019 $849,433,928
## 5 2020 $860,076,232
## 6 2021 $105,749,432
Is this it? What else might we need to account for?
calculateFY <- function(date = Sys.Date(), date.format = "%Y-%m-%d", ytd = FALSE,
fiscal.year = ifelse(month(Sys.Date()) >= 7, year(Sys.Date()) +1, year(Sys.Date()))){
date <- as.Date(date, date.format)
fy.date <-
ifelse(month(date) %in% c(1:6),
year(date),
year(date) + 1)
if(ytd == TRUE){
fy <- fiscal.year
end.this.fy <- as.Date(paste0("6/30/", fy), format = "%m/%d/%Y")
days.left.this.fy <- end.this.fy - Sys.Date()
end.date.fy <- as.Date(paste0("6/30/", fy.date), format = "%m/%d/%Y")
days.left.date.fy <- end.date.fy - date
if(days.left.date.fy >= days.left.this.fy){
return(fy.date)
}else{
return(NA)
}
}else{
return(fy.date)
}
}
# giving$fy <- unlist(lapply(giving$gift_date, function(x) suppressWarnings(calculateFY(x, ytd = TRUE))))
ytd_table <- tibble(
gift_date = seq(min(giving$gift_date), max(giving$gift_date), by = "day"),
)
ytd_table$fy_ytd <- unlist(lapply(ytd_table$gift_date, function(x) suppressWarnings(calculateFY(x, ytd = TRUE))))
giving %>%
left_join(ytd_table) %>%
filter(!is.na(fy_ytd)) %>%
filter(credit_type == "Hard-Credit") %>%
group_by(fy_ytd) %>%
summarise(total_giving = dollar(sum(gift_amt)))
## # A tibble: 5 x 2
## fy_ytd total_giving
## <dbl> <chr>
## 1 2017 $106,580,994
## 2 2018 $104,495,699
## 3 2019 $106,861,990
## 4 2020 $112,001,699
## 5 2021 $105,749,432
There is a fundraising R package in development that may help and is available here.
What might our first step be?
# calculate annual and total giving
# see who is not rated or rated low
giving_by_household_and_fy <-
giving %>%
group_by(household_id, fy) %>%
summarise(giving = sum(gift_amt)) %>%
spread(fy, giving, sep = "") %>%
ungroup() %>%
mutate(total_giving = rowSums(select(., contains("fy")), na.rm = TRUE))
sum(duplicated(giving_by_household_and_fy$household_id))
## [1] 0
bio_with_household_giving <-
bio %>%
filter(!duplicated(household_id)) %>%
left_join(giving_by_household_and_fy)
bio_with_household_giving %>%
filter(capacity_source %in% c(NA, "screening")) %>%
filter(total_giving > 10000) %>%
filter(!is.na(fy2019)) %>%
arrange(desc(total_giving)) %>%
select(name, capacity, capacity_source, contains("fy"), total_giving) %>%
datatable(rownames = FALSE) %>%
formatCurrency(columns = c(3:10), digits = 0)
What might our first step be?
bio_with_household_giving %>%
filter(total_giving > 10000) %>%
filter(!is.na(fy2019)) %>%
leaflet() %>%
addTiles() %>%
addCircleMarkers(clusterOptions = markerClusterOptions(),
label = ~paste0(name, ": ", scales::dollar(total_giving)))